##Importing Libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(shiny)
library(tidyr)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2
## ──
## ✔ ggplot2 3.4.0 ✔ purrr 1.0.1
## ✔ tibble 3.2.1 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 1.0.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(leaps)
library(knitr)
library(ggplot2)
library("reshape2")
##
## Attaching package: 'reshape2'
##
## The following object is masked from 'package:tidyr':
##
## smiths
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(class)
library(psych)
##
## Attaching package: 'psych'
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(tree)
library(rpart)
library(rattle)
## Loading required package: bitops
## Rattle: A free graphical interface for data science with R.
## Version 5.5.1 Copyright (c) 2006-2021 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:rattle':
##
## importance
##
## The following object is masked from 'package:psych':
##
## outlier
##
## The following object is masked from 'package:ggplot2':
##
## margin
##
## The following object is masked from 'package:dplyr':
##
## combine
library(readxl)
library(moments)
library(FactoMineR)
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following object is masked from 'package:bitops':
##
## %&%
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loaded glmnet 4.1-7
library(corrplot)
## corrplot 0.92 loaded
library(rsample)
library(gridExtra)
##
## Attaching package: 'gridExtra'
##
## The following object is masked from 'package:randomForest':
##
## combine
##
## The following object is masked from 'package:dplyr':
##
## combine
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:psych':
##
## logit
##
## The following object is masked from 'package:purrr':
##
## some
##
## The following object is masked from 'package:dplyr':
##
## recode
# Read the cleaned CSV and the original (pre-cleaning) Excel workbook.
dataset_s1 <- read.csv(
  file = "/Users/jasonrayen/Downloads/Jason Masters/Sem 3/AIT 664 (Information, Visualization & Representation)/Project/dataset_s1_cleaned.csv"
)
dataset_before_cleaning_s1 <- read_xlsx(
  path = "/Users/jasonrayen/Downloads/Jason Masters/Sem 3/AIT 664 (Information, Visualization & Representation)/Project/S1 File.xlsx"
)
# Count missing vs. non-missing cells over the whole cleaned data frame.
table(is.na(dataset_s1))
##
## FALSE
## 31239
No null (NA) values are present in the dataset.
# The crime-incident column holds "-" placeholders that stop it from being
# numeric. Mark them as NA, drop the affected rows, then convert to numeric.
dataset_s1$The.number.of.crime.incidents[
  dataset_s1$The.number.of.crime.incidents %in% "-"
] <- NA
dataset_s1 <- na.omit(dataset_s1)
dataset_s1$The.number.of.crime.incidents <-
  as.numeric(dataset_s1$The.number.of.crime.incidents)
# Coerce every column of the cleaned data set to numeric.
dataset_s1[] <- lapply(dataset_s1, as.numeric)

# Apply the same placeholder handling to the workbook data.
dataset_before_cleaning_s1$`The number of crime incidents`[
  dataset_before_cleaning_s1$`The number of crime incidents` %in% "-"
] <- NA
dataset_before_cleaning_s1 <- na.omit(dataset_before_cleaning_s1)
dataset_before_cleaning_s1$`The number of crime incidents` <-
  as.numeric(dataset_before_cleaning_s1$`The number of crime incidents`)

# Inspect the resulting column types.
str(dataset_s1)
## 'data.frame': 799 obs. of 39 variables:
## $ Census.tract : num 101 102 102 103 104 ...
## $ The.number.of.crime.incidents : num 564 452 458 360 225 192 110 110 367 175 ...
## $ Total.population : num 4189 7083 2502 6213 4730 ...
## $ Median.age : num 34.4 32.2 40.2 39.6 25.6 38 28.8 35 35.9 34.1 ...
## $ White : num 2073 3198 1099 3429 3427 ...
## $ Black.or.African.American : num 1687 3545 839 1806 660 ...
## $ American.Indian.or.Alaska.Native : num 0 16 6 5 0 0 0 0 4 0 ...
## $ Asian : num 162 148 248 741 395 100 638 326 106 200 ...
## $ Native.Hawaiian.and.Other.Pacific.Islander : num 0 0 18 0 0 0 0 0 0 0 ...
## $ Some.other.race : num 93 41 208 148 93 17 246 12 87 412 ...
## $ Two.or.more.races : num 174 135 84 84 155 194 88 96 319 114 ...
## $ Hispanic.or.Latino : num 456 1571 658 964 454 ...
## $ Not.Hispani.or.Latino : num 3733 5512 1844 5249 4276 ...
## $ Total.housing.units : num 2614 2995 1236 3258 2178 ...
## $ Vacant.housing.units : num 467 489 167 563 341 324 389 253 133 129 ...
## $ Median.housing.value : num 191600 169300 165700 195000 221700 ...
## $ Percent.of.less.than.9th.grade : num 1.8 11.4 8.1 12.3 2.6 0.6 16.3 13.6 7.7 15.2 ...
## $ Percent.of.9th.to.12th.grade : num 2.6 8.5 9.5 10.5 0.6 6.9 4.3 11 4.5 4 ...
## $ Percent.of.high.school.graduate : num 21.5 22.1 21.4 21.5 13.9 9.8 8.3 22.8 20.1 21.1 ...
## $ Percent.of.some.college : num 31.8 18.7 20.8 19.9 9.7 29.6 18.7 9 16 16 ...
## $ Percent.of.associate.s.degree : num 3.8 7.3 6.2 3.4 5.1 3.9 4.5 1.9 4.6 2.3 ...
## $ Percent.of.bachelor.s.degree : num 22.3 21.2 19.4 19.5 31.3 28.4 33.5 23.4 28.2 23.7 ...
## $ Percent.of.graduate.or.professional.degree : num 16.2 10.7 14.5 12.8 36.7 20.8 14.5 18.3 18.9 17.7 ...
## $ Percent.of.employed.population : num 63.6 58.2 54.4 56.7 56.5 64.2 58.5 52.5 70.9 68.2 ...
## $ Percent.of.unemployed.population : num 11.3 7.5 6.3 5.2 5.9 6.8 4.5 4.5 5.2 8.6 ...
## $ Percent.of.population.of.agriculture..forestry..fishing..hunting..and.mining : num 0 0 0 1.6 0.6 0 0 0.3 0.4 0 ...
## $ Percent.of.population.of.construction : num 6.2 2.4 0.6 3.8 0.9 4.1 0 2.4 6.7 3.4 ...
## $ Percent.of.population.of.manufacturing : num 4.7 4.7 0.7 9.8 2.9 4.1 3.2 6.4 4.9 5 ...
## $ Percent.of.population.of.wholesale.trade : num 0.8 0.7 1.7 0.4 1.1 1.2 0.6 0 1.5 2 ...
## $ Percent.of.population.of.retail.trade : num 6.2 12.3 11.7 7.8 11 7.6 6.4 12.4 11.7 7.8 ...
## $ Percent.of.population.of.transportation..warehousing..and.utilities : num 5.8 10.6 2.3 2.1 1.7 4 3 4.5 2.6 3.7 ...
## $ Percent.of.population.of.information : num 0.6 2 4 2 1.6 3.8 3.5 0.8 3.6 3 ...
## $ Percent.of.population.of.finance..insurance..real.estate..rental..and.leasing : num 5.8 4.3 2 5.6 6.2 9.5 6 11.4 7.9 7.7 ...
## $ Percent.of.population.of.professional..scientific..management..administrative..and.waste.management.services: num 14.1 11.3 8.8 19.6 12 8.7 10.4 9.1 13.2 12.5 ...
## $ Percent.of.population.of.educational.services..health.care..and.social.assistance : num 36.6 31.4 40.1 26.6 34.9 32.1 28.3 27.4 27.4 26.5 ...
## $ Percent.of.population.of.arts..entertainment..recreation..accommodation..and.food.services : num 9.3 7.6 16 12.1 18.3 11.7 28.9 15.4 13.9 18 ...
## $ Percent.of.population.of.public.administration : num 5.1 8.3 11.3 6.2 7.4 5.9 6.1 8.1 5.3 9.5 ...
## $ Percent.of.population.of.other.services : num 4.7 4.4 0.8 2.4 1.4 7.3 3.5 1.9 1.1 0.8 ...
## $ Mean.income : num 58908 68583 54897 83002 96641 ...
## - attr(*, "na.action")= 'omit' Named int [1:2] 800 801
## ..- attr(*, "names")= chr [1:2] "800" "801"
# Take a copy of the workbook-based data frame; the exploratory plots below
# read from this copy.
dataset_s1_copy <- dataset_before_cleaning_s1
# Per-column summary statistics of the numeric data set.
summary(dataset_s1)
## Census.tract The.number.of.crime.incidents Total.population Median.age
## Min. : 101 Min. : 3.0 Min. : 0 Min. :17.50
## 1st Qu.:1608 1st Qu.: 154.5 1st Qu.: 2034 1st Qu.:30.20
## Median :3514 Median : 259.0 Median : 3128 Median :33.10
## Mean :4039 Mean : 334.3 Mean : 3424 Mean :34.23
## 3rd Qu.:6702 3rd Qu.: 413.5 3rd Qu.: 4504 3rd Qu.:38.10
## Max. :8439 Max. :3217.0 Max. :17582 Max. :63.50
## White Black.or.African.American American.Indian.or.Alaska.Native
## Min. : 0 Min. : 0 Min. : 0.000
## 1st Qu.: 118 1st Qu.: 78 1st Qu.: 0.000
## Median :1439 Median : 350 Median : 0.000
## Mean :1663 Mean :1087 Mean : 9.083
## 3rd Qu.:2694 3rd Qu.:1658 3rd Qu.: 7.000
## Max. :8764 Max. :7063 Max. :219.000
## Asian Native.Hawaiian.and.Other.Pacific.Islander Some.other.race
## Min. : 0.0 Min. : 0.0000 Min. : 0
## 1st Qu.: 0.0 1st Qu.: 0.0000 1st Qu.: 11
## Median : 46.0 Median : 0.0000 Median : 92
## Mean : 195.9 Mean : 0.9099 Mean : 392
## 3rd Qu.: 221.5 3rd Qu.: 0.0000 3rd Qu.: 392
## Max. :6691.0 Max. :65.0000 Max. :4937
## Two.or.more.races Hispanic.or.Latino Not.Hispani.or.Latino Total.housing.units
## Min. : 0.00 Min. : 0.0 Min. : 0 Min. : 0
## 1st Qu.: 15.00 1st Qu.: 74.5 1st Qu.: 1288 1st Qu.: 888
## Median : 51.00 Median : 351.0 Median : 2133 Median : 1340
## Mean : 75.97 Mean : 988.2 Mean : 2435 Mean : 1502
## 3rd Qu.:106.50 3rd Qu.:1333.0 3rd Qu.: 3296 3rd Qu.: 1886
## Max. :750.00 Max. :7256.0 Max. :16353 Max. :12190
## Vacant.housing.units Median.housing.value Percent.of.less.than.9th.grade
## Min. : 0.0 Min. : 55700 Min. : 0.000
## 1st Qu.: 89.5 1st Qu.:146050 1st Qu.: 2.700
## Median : 161.0 Median :207900 Median : 5.800
## Mean : 203.9 Mean :239365 Mean : 9.193
## 3rd Qu.: 252.0 3rd Qu.:297750 3rd Qu.:12.150
## Max. :2023.0 Max. :814300 Max. :45.200
## Percent.of.9th.to.12th.grade Percent.of.high.school.graduate
## Min. : 0.000 Min. : 0.40
## 1st Qu.: 4.300 1st Qu.:16.00
## Median : 9.100 Median :25.90
## Mean : 9.927 Mean :24.27
## 3rd Qu.:14.300 3rd Qu.:33.10
## Max. :41.900 Max. :56.80
## Percent.of.some.college Percent.of.associate.s.degree
## Min. : 2.00 Min. : 0.000
## 1st Qu.:12.45 1st Qu.: 3.400
## Median :18.20 Median : 5.000
## Mean :18.73 Mean : 5.462
## 3rd Qu.:24.60 3rd Qu.: 7.150
## Max. :42.90 Max. :19.000
## Percent.of.bachelor.s.degree Percent.of.graduate.or.professional.degree
## Min. : 0.00 Min. : 0.00
## 1st Qu.: 7.60 1st Qu.: 3.20
## Median :15.20 Median : 8.20
## Mean :19.51 Mean :12.91
## 3rd Qu.:28.10 3rd Qu.:20.50
## Max. :66.20 Max. :70.50
## Percent.of.employed.population Percent.of.unemployed.population
## Min. : 0.00 Min. : 0.400
## 1st Qu.:43.80 1st Qu.: 5.500
## Median :56.70 Median : 8.400
## Mean :55.74 Mean : 9.525
## 3rd Qu.:65.25 3rd Qu.:12.350
## Max. :90.60 Max. :74.100
## Percent.of.population.of.agriculture..forestry..fishing..hunting..and.mining
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.1445
## 3rd Qu.:0.0000
## Max. :4.0000
## Percent.of.population.of.construction Percent.of.population.of.manufacturing
## Min. : 0.000 Min. : 0.000
## 1st Qu.: 1.100 1st Qu.: 4.400
## Median : 3.000 Median : 7.300
## Mean : 3.876 Mean : 9.026
## 3rd Qu.: 5.900 3rd Qu.:12.100
## Max. :21.500 Max. :34.400
## Percent.of.population.of.wholesale.trade Percent.of.population.of.retail.trade
## Min. : 0.00 Min. : 0.000
## 1st Qu.: 0.70 1st Qu.: 6.500
## Median : 1.80 Median : 8.800
## Mean : 2.25 Mean : 9.346
## 3rd Qu.: 3.30 3rd Qu.:11.800
## Max. :22.00 Max. :31.200
## Percent.of.population.of.transportation..warehousing..and.utilities
## Min. : 0.000
## 1st Qu.: 3.100
## Median : 5.600
## Mean : 6.597
## 3rd Qu.: 9.200
## Max. :33.700
## Percent.of.population.of.information
## Min. : 0.000
## 1st Qu.: 0.700
## Median : 1.900
## Mean : 2.261
## 3rd Qu.: 3.400
## Max. :12.700
## Percent.of.population.of.finance..insurance..real.estate..rental..and.leasing
## Min. : 0.000
## 1st Qu.: 3.900
## Median : 6.500
## Mean : 7.346
## 3rd Qu.: 9.900
## Max. :26.500
## Percent.of.population.of.professional..scientific..management..administrative..and.waste.management.services
## Min. : 0.00
## 1st Qu.: 9.00
## Median :12.30
## Mean :13.93
## 3rd Qu.:17.20
## Max. :40.00
## Percent.of.population.of.educational.services..health.care..and.social.assistance
## Min. : 3.20
## 1st Qu.:17.50
## Median :23.00
## Mean :24.17
## 3rd Qu.:30.15
## Max. :65.90
## Percent.of.population.of.arts..entertainment..recreation..accommodation..and.food.services
## Min. : 0.00
## 1st Qu.: 6.85
## Median :10.10
## Mean :11.17
## 3rd Qu.:14.40
## Max. :40.10
## Percent.of.population.of.public.administration
## Min. : 0.00
## 1st Qu.: 3.20
## Median : 5.00
## Mean : 5.27
## 3rd Qu.: 6.80
## Max. :20.70
## Percent.of.population.of.other.services Mean.income
## Min. : 0.000 Min. : 4197
## 1st Qu.: 1.700 1st Qu.: 46270
## Median : 3.300 Median : 62842
## Mean : 4.616 Mean : 83423
## 3rd Qu.: 6.350 3rd Qu.: 97354
## Max. :28.000 Max. :399454
#This will give us certain information if we need scaling or not
From the summary statistics, we can observe that most of the variables have abnormal (skewed, widely ranging) distributions, which would strongly affect the fit of the model. These effects have to be reduced with normalization and generalization methods, which will be applied before the model is fitted.
# EXPLORATORY DATA ANALYSIS
## Distribution of crime occurrences
# Histogram of the crime-incident counts on the density scale, with a kernel
# density estimate drawn on top.
# `after_stat(density)` replaces the `..density..` dot-dot notation, which is
# deprecated as of ggplot2 3.4.0.
ggplot(dataset_s1_copy, aes(x = `The number of crime incidents`)) +
  geom_histogram(
    aes(y = after_stat(density)),
    binwidth = 70,
    fill = "cornsilk",
    color = "black"
  ) +
  geom_density(adjust = 0.8, fill = "cyan", color = "black", alpha = 0.4) +
  labs(
    x = "Crime Occurences",
    y = "Density",
    title = "Number of Crime Occurences Distribution"
  )
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Most crime counts fall in the range of 0 to 1000. The categories used for the EDA are therefore defined within that range.
# Label every row with one of four crime levels based on its incident count:
# up to 200, above 200 up to 500, above 500 but below 800, and 800 or more.
# A missing count yields a missing label.
incident_count <- dataset_s1_copy[["The number of crime incidents"]]
dataset_s1_copy$crime_categories <- case_when(
  is.na(incident_count) ~ NA_character_,
  incident_count <= 200 ~ "Less number of crimes (<200)",
  incident_count <= 500 ~ "200 to 500 crime rate",
  incident_count < 800 ~ "500 to 800 crime rate",
  TRUE ~ "High Crime Rate"
)
# Number of rows in each category.
table(dataset_s1_copy$crime_categories)
##
## 200 to 500 crime rate 500 to 800 crime rate
## 346 100
## High Crime Rate Less number of crimes (<200)
## 52 301
# Mean White population within each crime category, shown as one bar per
# category.
average_white <- summarize(
  group_by(dataset_s1_copy, crime_categories),
  average_white = mean(White, na.rm = TRUE)
)
ggplot(
  average_white,
  aes(x = crime_categories, y = average_white, fill = crime_categories)
) +
  geom_col(color = "black") +
  scale_fill_manual(values = c("#66FFFF", "#FF0033", "#FFCC00", "#0066CC")) +
  labs(
    title = "Average White People - Crime Occurrences",
    x = "Crime Rates",
    y = "Average White People",
    fill = "Crime Categories"
  )
Census tracts in the high-crime category have, on average, around 800 White residents. Tracts with a larger White population most often fall into the lower-crime categories.
# Mean Black or African American population within each crime category, shown
# as one bar per category.
# NOTE: the result is stored under the existing name `average_white`.
average_white <- summarize(
  group_by(dataset_s1_copy, crime_categories),
  average_Black.or.African.American = mean(`Black or African American`, na.rm = TRUE)
)
ggplot(
  average_white,
  aes(
    x = crime_categories,
    y = average_Black.or.African.American,
    fill = crime_categories
  )
) +
  geom_col(color = "black") +
  scale_fill_manual(values = c("#66FFFF", "#FF0033", "#FFCC00", "#0066CC")) +
  labs(
    title = "Average Black.or.African.American People - Crime Occurrences",
    x = "Crime Rates",
    y = "Average Black African American People",
    fill = "Crime Categories"
  )
Census tracts in the high-crime category have the largest average Black or
African American population, at around 3000 residents per tract, which is
the highest of the four crime categories.
# Mean American Indian or Alaska Native population within each crime category,
# shown as one bar per category.
# NOTE: the result is stored under the existing name `average_white`.
average_white <- summarize(
  group_by(dataset_s1_copy, crime_categories),
  average_American.Indian.or.Alaska.Native = mean(`American Indian or Alaska Native`, na.rm = TRUE)
)
ggplot(
  average_white,
  aes(
    x = crime_categories,
    y = average_American.Indian.or.Alaska.Native,
    fill = crime_categories
  )
) +
  geom_col(color = "black") +
  scale_fill_manual(values = c("#66FFFF", "#FF0033", "#FFCC00", "#0066CC")) +
  labs(
    title = "Average American.Indian.or.Alaska.Native People - Crime Occurrences",
    x = "Crime Rates",
    y = "Average American.Indian.or.Alaska.Nativen People",
    fill = "Crime Categories"
  )
Census tracts in the high-crime category have, on average, about 10 American
Indian or Alaska Native residents per tract.
# Mean Asian population within each crime category, shown as one bar per
# category.
# NOTE: the result is stored under the existing name `average_white`.
average_white <- summarize(
  group_by(dataset_s1_copy, crime_categories),
  average_Asian = mean(Asian, na.rm = TRUE)
)
ggplot(
  average_white,
  aes(x = crime_categories, y = average_Asian, fill = crime_categories)
) +
  geom_col(color = "black") +
  scale_fill_manual(values = c("#66FFFF", "#FF0033", "#FFCC00", "#0066CC")) +
  labs(
    title = "Average Asian People - Crime Occurrences",
    x = "Crime Rates",
    y = "Average Asian People",
    fill = "Crime Categories"
  )
Tracts with a larger Asian population mostly fall into the category with fewer than 200 crimes: tracts with 200 crimes or fewer have, on average, around 200 Asian residents per tract. High-crime tracts have approximately 170 Asian residents on average.
# Mean Native Hawaiian and Other Pacific Islander population within each crime
# category, shown as one bar per category.
# NOTE: the result is stored under the existing name `average_white`.
average_white <- summarize(
  group_by(dataset_s1_copy, crime_categories),
  average_Native.Hawaiian.and.Other.Pacific.Islander = mean(`Native Hawaiian and Other Pacific Islander`, na.rm = TRUE)
)
ggplot(
  average_white,
  aes(
    x = crime_categories,
    y = average_Native.Hawaiian.and.Other.Pacific.Islander,
    fill = crime_categories
  )
) +
  geom_col(color = "black") +
  scale_fill_manual(values = c("#66FFFF", "#FF0033", "#FFCC00", "#0066CC")) +
  labs(
    title = "Average Hawaii People - Crime Occurrences",
    x = "Crime Rates",
    y = "Average Hawaii People",
    fill = "Crime Categories"
  )
Census tracts in the high-crime category have, on average, only about 2 Native
Hawaiian and Other Pacific Islander residents per tract, which is far fewer than for the other groups.
# Pie charts of the mean male and mean female population per crime category,
# displayed side by side.

# Male: mean male population for each crime category.
average_male <- summarize(
  group_by(dataset_s1_copy, crime_categories),
  average_male = mean(`Male population`, na.rm = TRUE)
)
# A single stacked bar bent into polar coordinates gives the pie; each slice is
# labelled with its rounded mean.
plot_male <- ggplot(
  average_male,
  aes(x = "", y = average_male, fill = crime_categories)
) +
  geom_col(width = 1) +
  geom_text(
    aes(label = round(average_male, 2)),
    position = position_stack(vjust = 0.5)
  ) +
  coord_polar(theta = "y", start = 0) +
  labs(title = "Average Male Population - Crimes", fill = "Crime Rate", y = NULL) +
  theme(legend.position = "bottom")

# Female: mean female population for each crime category.
average_female <- summarize(
  group_by(dataset_s1_copy, crime_categories),
  average_female = mean(`Female population`, na.rm = TRUE)
)
plot_female <- ggplot(
  average_female,
  aes(x = "", y = average_female, fill = crime_categories)
) +
  geom_col(width = 1) +
  geom_text(
    aes(label = round(average_female, 2)),
    position = position_stack(vjust = 0.5)
  ) +
  coord_polar(theta = "y", start = 0) +
  labs(title = "Average Female Population - Crimes", fill = "Crime Rate", y = NULL) +
  theme(legend.position = "bottom")

# Show both pies in one row.
grid.arrange(plot_male, plot_female, ncol = 2)
Census tracts in the high-crime category have, on average, about 2580 female residents and about 2300 male residents per tract.
# Density charts of the employed and the unemployed population, one
# semi-transparent curve per crime category, stacked in a single column.
employed_population <- ggplot(
  dataset_s1_copy,
  aes(x = `Employed population`, fill = crime_categories)
) +
  geom_density(alpha = 0.3) +
  scale_fill_manual(values = c("red", "blue", "green", "orange")) +
  labs(
    title = "Density plot for Employed Population",
    x = "Employed Population",
    y = "Density"
  )

unemployed_population <- ggplot(
  dataset_s1_copy,
  aes(x = `Unemployed population`, fill = crime_categories)
) +
  geom_density(alpha = 0.3) +
  scale_fill_manual(values = c("red", "blue", "green", "orange")) +
  labs(
    title = "Density plot for UnEmployed Population",
    x = "UnEmployed Population",
    y = "Density"
  )

grid.arrange(employed_population, unemployed_population, ncol = 1)
Tracts with a larger unemployed population tend to fall into the higher-crime
categories, whereas for the employed population the association with crime is weaker.
Higher crime is therefore more closely associated with unemployment than with employment.
## Skewness check
# Compute the skewness of every numeric column in one pass and collect the
# results in a two-column data frame (variable name, skewness).
numeric_col_names <- names(dataset_s1)[vapply(dataset_s1, is.numeric, logical(1))]
skewness_df <- data.frame(
  Variable_Name = numeric_col_names,
  skewness = vapply(
    numeric_col_names,
    function(col_name) skewness(dataset_s1[[col_name]], na.rm = TRUE),
    numeric(1),
    USE.NAMES = FALSE
  ),
  stringsAsFactors = FALSE
)
skewness_df

## Variables with heavy skewness: skewness of at least 3 or at most -2.
is_heavily_skewed <- skewness_df$skewness >= 3 | skewness_df$skewness <= -2
heavy_skewness <- skewness_df[is_heavily_skewed, ]
heavy_skewness
# Convert every character column of the workbook data to numeric and leave all
# other columns unchanged.
# A scalar `if`/`else` is required here: `is.character(column)` is a single
# TRUE/FALSE, and `ifelse()` shapes its result after the test, so
# `ifelse(is.character(x), as.numeric(x), x)` returns only the first element of
# each column, which `[] <-` then recycles down every row.
dataset_before_cleaning_s1[] <- lapply(
  dataset_before_cleaning_s1,
  function(column) {
    if (is.character(column)) {
      as.numeric(column)
    } else {
      column
    }
  }
)
# Inspect the resulting column types.
str(dataset_before_cleaning_s1)
## tibble [799 × 79] (S3: tbl_df/tbl/data.frame)
## $ Census tract : num [1:799] 101 101 101 101 101 101 101 101 101 101 ...
## $ The number of crime incidents : num [1:799] 564 564 564 564 564 564 564 564 564 564 ...
## $ Total population : num [1:799] 4189 4189 4189 4189 4189 ...
## $ Male population : num [1:799] 1742 1742 1742 1742 1742 ...
## $ Percent of male population : num [1:799] 41.6 41.6 41.6 41.6 41.6 41.6 41.6 41.6 41.6 41.6 ...
## $ Female population : num [1:799] 2447 2447 2447 2447 2447 ...
## $ Percent of female population : num [1:799] 58.4 58.4 58.4 58.4 58.4 58.4 58.4 58.4 58.4 58.4 ...
## $ Median age : num [1:799] 34.4 34.4 34.4 34.4 34.4 34.4 34.4 34.4 34.4 34.4 ...
## $ Under 16 years : num [1:799] 789 789 789 789 789 789 789 789 789 789 ...
## $ Percent of under 16 years : num [1:799] 18.8 18.8 18.8 18.8 18.8 ...
## $ Over 65 years : num [1:799] 182 182 182 182 182 182 182 182 182 182 ...
## $ Percent of over 65 years : num [1:799] 4.3 4.3 4.3 4.3 4.3 4.3 4.3 4.3 4.3 4.3 ...
## $ White : num [1:799] 2073 2073 2073 2073 2073 ...
## $ Black or African American : num [1:799] 1687 1687 1687 1687 1687 ...
## $ American Indian or Alaska Native : num [1:799] 0 0 0 0 0 0 0 0 0 0 ...
## $ Asian : num [1:799] 162 162 162 162 162 162 162 162 162 162 ...
## $ Native Hawaiian and Other Pacific Islander : num [1:799] 0 0 0 0 0 0 0 0 0 0 ...
## $ Some other race : num [1:799] 93 93 93 93 93 93 93 93 93 93 ...
## $ Two or more races : num [1:799] 174 174 174 174 174 174 174 174 174 174 ...
## $ Hispanic or Latino : num [1:799] 456 456 456 456 456 456 456 456 456 456 ...
## $ Not Hispani or Latino : num [1:799] 3733 3733 3733 3733 3733 ...
## $ Total housing units : num [1:799] 2614 2614 2614 2614 2614 ...
## $ Occupied housing units : num [1:799] 2147 2147 2147 2147 2147 ...
## $ Percent of occupied housing units : num [1:799] 82.1 82.1 82.1 82.1 82.1 82.1 82.1 82.1 82.1 82.1 ...
## $ Vacant housing units : num [1:799] 467 467 467 467 467 467 467 467 467 467 ...
## $ Percent of vacant housing units : num [1:799] 17.9 17.9 17.9 17.9 17.9 17.9 17.9 17.9 17.9 17.9 ...
## $ Median housing value : num [1:799] 191600 191600 191600 191600 191600 ...
## $ Over 25 years : num [1:799] 2949 2949 2949 2949 2949 ...
## $ Percent of less than 9th grade : num [1:799] 1.8 1.8 1.8 1.8 1.8 1.8 1.8 1.8 1.8 1.8 ...
## $ Percent of 9th to 12th grade : num [1:799] 2.6 2.6 2.6 2.6 2.6 2.6 2.6 2.6 2.6 2.6 ...
## $ Percent of high school graduate : num [1:799] 21.5 21.5 21.5 21.5 21.5 21.5 21.5 21.5 21.5 21.5 ...
## $ Percent of some college : num [1:799] 31.8 31.8 31.8 31.8 31.8 31.8 31.8 31.8 31.8 31.8 ...
## $ Percent of associate’s degree : num [1:799] 3.8 3.8 3.8 3.8 3.8 3.8 3.8 3.8 3.8 3.8 ...
## $ Percent of bachelor’s degree : num [1:799] 22.3 22.3 22.3 22.3 22.3 22.3 22.3 22.3 22.3 22.3 ...
## $ Percent of graduate or professional degree : num [1:799] 16.2 16.2 16.2 16.2 16.2 16.2 16.2 16.2 16.2 16.2 ...
## $ Percent of high school graduate or higher : num [1:799] 95.6 95.6 95.6 95.6 95.6 95.6 95.6 95.6 95.6 95.6 ...
## $ Percent of bachelor’s degree of higher : num [1:799] 38.6 38.6 38.6 38.6 38.6 38.6 38.6 38.6 38.6 38.6 ...
## $ Percent of less than high school graduate : num [1:799] 4.4 4.4 4.4 4.4 4.4 ...
## $ Percent of less than bachelor’s degree : num [1:799] 61.4 61.4 61.4 61.4 61.4 61.4 61.4 61.4 61.4 61.4 ...
## $ Over 16 years : num [1:799] 3400 3400 3400 3400 3400 3400 3400 3400 3400 3400 ...
## $ Percent of over 16 years : num [1:799] 81.2 81.2 81.2 81.2 81.2 ...
## $ Population in labor force : num [1:799] 2546 2546 2546 2546 2546 ...
## $ Percent of population in labor force : num [1:799] 74.9 74.9 74.9 74.9 74.9 74.9 74.9 74.9 74.9 74.9 ...
## $ Population not in labor force : num [1:799] 854 854 854 854 854 854 854 854 854 854 ...
## $ Percent of population not in labor force : num [1:799] 25.1 25.1 25.1 25.1 25.1 25.1 25.1 25.1 25.1 25.1 ...
## $ Employed population : num [1:799] 2161 2161 2161 2161 2161 ...
## $ Percent of employed population : num [1:799] 63.6 63.6 63.6 63.6 63.6 63.6 63.6 63.6 63.6 63.6 ...
## $ Unemployed population : num [1:799] 385 385 385 385 385 385 385 385 385 385 ...
## $ Percent of unemployed population : num [1:799] 11.3 11.3 11.3 11.3 11.3 11.3 11.3 11.3 11.3 11.3 ...
## $ Population not in labor force and unemployed population : num [1:799] 1239 1239 1239 1239 1239 ...
## $ Percent of population not in labor force and unemployed population : num [1:799] 36.4 36.4 36.4 36.4 36.4 36.4 36.4 36.4 36.4 36.4 ...
## $ Population of agriculture, forestry, fishing, hunting, and mining : num [1:799] 0 0 0 0 0 0 0 0 0 0 ...
## $ Percent of population of agriculture, forestry, fishing, hunting, and mining : num [1:799] 0 0 0 0 0 0 0 0 0 0 ...
## $ Population of construction : num [1:799] 135 135 135 135 135 135 135 135 135 135 ...
## $ Percent of population of construction : num [1:799] 6.2 6.2 6.2 6.2 6.2 6.2 6.2 6.2 6.2 6.2 ...
## $ Population of manufacturing : num [1:799] 102 102 102 102 102 102 102 102 102 102 ...
## $ Percent of population of manufacturing : num [1:799] 4.7 4.7 4.7 4.7 4.7 4.7 4.7 4.7 4.7 4.7 ...
## $ Population of wholesale trade : num [1:799] 18 18 18 18 18 18 18 18 18 18 ...
## $ Percent of population of wholesale trade : num [1:799] 0.8 0.8 0.8 0.8 0.8 0.8 0.8 0.8 0.8 0.8 ...
## $ Population of retail trade : num [1:799] 133 133 133 133 133 133 133 133 133 133 ...
## $ Percent of population of retail trade : num [1:799] 6.2 6.2 6.2 6.2 6.2 6.2 6.2 6.2 6.2 6.2 ...
## $ Population of transportation, warehousing, and utilities : num [1:799] 125 125 125 125 125 125 125 125 125 125 ...
## $ Percent of population of transportation, warehousing, and utilities : num [1:799] 5.8 5.8 5.8 5.8 5.8 5.8 5.8 5.8 5.8 5.8 ...
## $ Population of information : num [1:799] 13 13 13 13 13 13 13 13 13 13 ...
## $ Percent of population of information : num [1:799] 0.6 0.6 0.6 0.6 0.6 0.6 0.6 0.6 0.6 0.6 ...
## $ Population of finance, insurance, real estate, rental, and leasing : num [1:799] 126 126 126 126 126 126 126 126 126 126 ...
## $ Percent of population of finance, insurance, real estate, rental, and leasing : num [1:799] 5.8 5.8 5.8 5.8 5.8 5.8 5.8 5.8 5.8 5.8 ...
## $ Population of professional, scientific, management, administrative, and waste management services : num [1:799] 304 304 304 304 304 304 304 304 304 304 ...
## $ Percent of population of professional, scientific, management, administrative, and waste management services: num [1:799] 14.1 14.1 14.1 14.1 14.1 14.1 14.1 14.1 14.1 14.1 ...
## $ Population of educational services, health care, and social assistance : num [1:799] 791 791 791 791 791 791 791 791 791 791 ...
## $ Percent of population of educational services, health care, and social assistance : num [1:799] 36.6 36.6 36.6 36.6 36.6 36.6 36.6 36.6 36.6 36.6 ...
## $ Population of arts, entertainment, recreation, accommodation, and food services : num [1:799] 202 202 202 202 202 202 202 202 202 202 ...
## $ Percent of population of arts, entertainment, recreation, accommodation, and food services : num [1:799] 9.3 9.3 9.3 9.3 9.3 9.3 9.3 9.3 9.3 9.3 ...
## $ Population of public administration : num [1:799] 111 111 111 111 111 111 111 111 111 111 ...
## $ Percent of population of public administration : num [1:799] 5.1 5.1 5.1 5.1 5.1 5.1 5.1 5.1 5.1 5.1 ...
## $ Population of other services : num [1:799] 101 101 101 101 101 101 101 101 101 101 ...
## $ Percent of population of other services : num [1:799] 4.7 4.7 4.7 4.7 4.7 4.7 4.7 4.7 4.7 4.7 ...
## $ Median income : num [1:799] 44826 44826 44826 44826 44826 ...
## $ Mean income : num [1:799] 58908 58908 58908 58908 58908 ...
## - attr(*, "na.action")= 'omit' Named int [1:2] 800 801
## ..- attr(*, "names")= chr [1:2] "800" "801"
# Read the second cleaned CSV, then preview its first rows and column types.
crime_data <- read.csv(
  file = "/Users/jasonrayen/Downloads/Jason Masters/Sem 3/AIT 664 (Information, Visualization & Representation)/Project/dataset_s1_cleaned2.csv"
)
head(crime_data)
str(crime_data)
## 'data.frame': 801 obs. of 39 variables:
## $ Census.tract : num 101 102 102 103 104 ...
## $ The.number.of.crime.incidents : chr "564" "452" "458" "360" ...
## $ Total.population : int 4189 7083 2502 6213 4730 3831 3690 2228 6513 3946 ...
## $ Median.age : num 34.4 32.2 40.2 39.6 25.6 38 28.8 35 35.9 34.1 ...
## $ White : int 2073 3198 1099 3429 3427 2330 2123 1577 4087 2565 ...
## $ Black.or.African.American : int 1687 3545 839 1806 660 1190 595 217 1910 655 ...
## $ American.Indian.or.Alaska.Native : int 0 16 6 5 0 0 0 0 4 0 ...
## $ Asian : int 162 148 248 741 395 100 638 326 106 200 ...
## $ Native.Hawaiian.and.Other.Pacific.Islander : int 0 0 18 0 0 0 0 0 0 0 ...
## $ Some.other.race : int 93 41 208 148 93 17 246 12 87 412 ...
## $ Two.or.more.races : int 174 135 84 84 155 194 88 96 319 114 ...
## $ Hispanic.or.Latino : int 456 1571 658 964 454 191 578 374 1715 1654 ...
## $ Not.Hispani.or.Latino : int 3733 5512 1844 5249 4276 3640 3112 1854 4798 2292 ...
## $ Total.housing.units : int 2614 2995 1236 3258 2178 2559 1956 1343 3261 1658 ...
## $ Vacant.housing.units : int 467 489 167 563 341 324 389 253 133 129 ...
## $ Median.housing.value : num 191600 169300 165700 195000 221700 ...
## $ Percent.of.less.than.9th.grade : num 1.8 11.4 8.1 12.3 2.6 0.6 16.3 13.6 7.7 15.2 ...
## $ Percent.of.9th.to.12th.grade : num 2.6 8.5 9.5 10.5 0.6 6.9 4.3 11 4.5 4 ...
## $ Percent.of.high.school.graduate : num 21.5 22.1 21.4 21.5 13.9 9.8 8.3 22.8 20.1 21.1 ...
## $ Percent.of.some.college : num 31.8 18.7 20.8 19.9 9.7 29.6 18.7 9 16 16 ...
## $ Percent.of.associate.s.degree : num 3.8 7.3 6.2 3.4 5.1 3.9 4.5 1.9 4.6 2.3 ...
## $ Percent.of.bachelor.s.degree : num 22.3 21.2 19.4 19.5 31.3 28.4 33.5 23.4 28.2 23.7 ...
## $ Percent.of.graduate.or.professional.degree : num 16.2 10.7 14.5 12.8 36.7 20.8 14.5 18.3 18.9 17.7 ...
## $ Percent.of.employed.population : num 63.6 58.2 54.4 56.7 56.5 64.2 58.5 52.5 70.9 68.2 ...
## $ Percent.of.unemployed.population : num 11.3 7.5 6.3 5.2 5.9 6.8 4.5 4.5 5.2 8.6 ...
## $ Percent.of.population.of.agriculture..forestry..fishing..hunting..and.mining : num 0 0 0 1.6 0.6 0 0 0.3 0.4 0 ...
## $ Percent.of.population.of.construction : num 6.2 2.4 0.6 3.8 0.9 4.1 0 2.4 6.7 3.4 ...
## $ Percent.of.population.of.manufacturing : num 4.7 4.7 0.7 9.8 2.9 4.1 3.2 6.4 4.9 5 ...
## $ Percent.of.population.of.wholesale.trade : num 0.8 0.7 1.7 0.4 1.1 1.2 0.6 0 1.5 2 ...
## $ Percent.of.population.of.retail.trade : num 6.2 12.3 11.7 7.8 11 7.6 6.4 12.4 11.7 7.8 ...
## $ Percent.of.population.of.transportation..warehousing..and.utilities : num 5.8 10.6 2.3 2.1 1.7 4 3 4.5 2.6 3.7 ...
## $ Percent.of.population.of.information : num 0.6 2 4 2 1.6 3.8 3.5 0.8 3.6 3 ...
## $ Percent.of.population.of.finance..insurance..real.estate..rental..and.leasing : num 5.8 4.3 2 5.6 6.2 9.5 6 11.4 7.9 7.7 ...
## $ Percent.of.population.of.professional..scientific..management..administrative..and.waste.management.services: num 14.1 11.3 8.8 19.6 12 8.7 10.4 9.1 13.2 12.5 ...
## $ Percent.of.population.of.educational.services..health.care..and.social.assistance : num 36.6 31.4 40.1 26.6 34.9 32.1 28.3 27.4 27.4 26.5 ...
## $ Percent.of.population.of.arts..entertainment..recreation..accommodation..and.food.services : num 9.3 7.6 16 12.1 18.3 11.7 28.9 15.4 13.9 18 ...
## $ Percent.of.population.of.public.administration : num 5.1 8.3 11.3 6.2 7.4 5.9 6.1 8.1 5.3 9.5 ...
## $ Percent.of.population.of.other.services : num 4.7 4.4 0.8 2.4 1.4 7.3 3.5 1.9 1.1 0.8 ...
## $ Mean.income : num 58908 68583 54897 83002 96641 ...
# Coerce the incident counts to numeric; entries that cannot be parsed as a
# number become NA, which is what triggers the coercion warning.
crime_data[["The.number.of.crime.incidents"]] <-
  as.numeric(crime_data[["The.number.of.crime.incidents"]])
## Warning: NAs introduced by coercion
# Report the row positions whose count could not be converted.
print(which(is.na(crime_data[["The.number.of.crime.incidents"]])))
## [1] 800 801
# Keep only the rows that have an incident count, then re-check the structure.
has_incident_count <- !is.na(crime_data[["The.number.of.crime.incidents"]])
crime_data <- crime_data[has_incident_count, ]
str(crime_data)
## 'data.frame': 799 obs. of 39 variables:
## $ Census.tract : num 101 102 102 103 104 ...
## $ The.number.of.crime.incidents : num 564 452 458 360 225 192 110 110 367 175 ...
## $ Total.population : int 4189 7083 2502 6213 4730 3831 3690 2228 6513 3946 ...
## $ Median.age : num 34.4 32.2 40.2 39.6 25.6 38 28.8 35 35.9 34.1 ...
## $ White : int 2073 3198 1099 3429 3427 2330 2123 1577 4087 2565 ...
## $ Black.or.African.American : int 1687 3545 839 1806 660 1190 595 217 1910 655 ...
## $ American.Indian.or.Alaska.Native : int 0 16 6 5 0 0 0 0 4 0 ...
## $ Asian : int 162 148 248 741 395 100 638 326 106 200 ...
## $ Native.Hawaiian.and.Other.Pacific.Islander : int 0 0 18 0 0 0 0 0 0 0 ...
## $ Some.other.race : int 93 41 208 148 93 17 246 12 87 412 ...
## $ Two.or.more.races : int 174 135 84 84 155 194 88 96 319 114 ...
## $ Hispanic.or.Latino : int 456 1571 658 964 454 191 578 374 1715 1654 ...
## $ Not.Hispani.or.Latino : int 3733 5512 1844 5249 4276 3640 3112 1854 4798 2292 ...
## $ Total.housing.units : int 2614 2995 1236 3258 2178 2559 1956 1343 3261 1658 ...
## $ Vacant.housing.units : int 467 489 167 563 341 324 389 253 133 129 ...
## $ Median.housing.value : num 191600 169300 165700 195000 221700 ...
## $ Percent.of.less.than.9th.grade : num 1.8 11.4 8.1 12.3 2.6 0.6 16.3 13.6 7.7 15.2 ...
## $ Percent.of.9th.to.12th.grade : num 2.6 8.5 9.5 10.5 0.6 6.9 4.3 11 4.5 4 ...
## $ Percent.of.high.school.graduate : num 21.5 22.1 21.4 21.5 13.9 9.8 8.3 22.8 20.1 21.1 ...
## $ Percent.of.some.college : num 31.8 18.7 20.8 19.9 9.7 29.6 18.7 9 16 16 ...
## $ Percent.of.associate.s.degree : num 3.8 7.3 6.2 3.4 5.1 3.9 4.5 1.9 4.6 2.3 ...
## $ Percent.of.bachelor.s.degree : num 22.3 21.2 19.4 19.5 31.3 28.4 33.5 23.4 28.2 23.7 ...
## $ Percent.of.graduate.or.professional.degree : num 16.2 10.7 14.5 12.8 36.7 20.8 14.5 18.3 18.9 17.7 ...
## $ Percent.of.employed.population : num 63.6 58.2 54.4 56.7 56.5 64.2 58.5 52.5 70.9 68.2 ...
## $ Percent.of.unemployed.population : num 11.3 7.5 6.3 5.2 5.9 6.8 4.5 4.5 5.2 8.6 ...
## $ Percent.of.population.of.agriculture..forestry..fishing..hunting..and.mining : num 0 0 0 1.6 0.6 0 0 0.3 0.4 0 ...
## $ Percent.of.population.of.construction : num 6.2 2.4 0.6 3.8 0.9 4.1 0 2.4 6.7 3.4 ...
## $ Percent.of.population.of.manufacturing : num 4.7 4.7 0.7 9.8 2.9 4.1 3.2 6.4 4.9 5 ...
## $ Percent.of.population.of.wholesale.trade : num 0.8 0.7 1.7 0.4 1.1 1.2 0.6 0 1.5 2 ...
## $ Percent.of.population.of.retail.trade : num 6.2 12.3 11.7 7.8 11 7.6 6.4 12.4 11.7 7.8 ...
## $ Percent.of.population.of.transportation..warehousing..and.utilities : num 5.8 10.6 2.3 2.1 1.7 4 3 4.5 2.6 3.7 ...
## $ Percent.of.population.of.information : num 0.6 2 4 2 1.6 3.8 3.5 0.8 3.6 3 ...
## $ Percent.of.population.of.finance..insurance..real.estate..rental..and.leasing : num 5.8 4.3 2 5.6 6.2 9.5 6 11.4 7.9 7.7 ...
## $ Percent.of.population.of.professional..scientific..management..administrative..and.waste.management.services: num 14.1 11.3 8.8 19.6 12 8.7 10.4 9.1 13.2 12.5 ...
## $ Percent.of.population.of.educational.services..health.care..and.social.assistance : num 36.6 31.4 40.1 26.6 34.9 32.1 28.3 27.4 27.4 26.5 ...
## $ Percent.of.population.of.arts..entertainment..recreation..accommodation..and.food.services : num 9.3 7.6 16 12.1 18.3 11.7 28.9 15.4 13.9 18 ...
## $ Percent.of.population.of.public.administration : num 5.1 8.3 11.3 6.2 7.4 5.9 6.1 8.1 5.3 9.5 ...
## $ Percent.of.population.of.other.services : num 4.7 4.4 0.8 2.4 1.4 7.3 3.5 1.9 1.1 0.8 ...
## $ Mean.income : num 58908 68583 54897 83002 96641 ...
#importing libraries
library(ggplot2)
library(dplyr)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Derive two aggregate education shares on a copy of crime_data.
#
# The per-level education columns are already expressed as percentages, so the
# aggregates are plain row-wise sums. Dividing the sum by Total.population and
# multiplying by 100 again would not produce a percentage.
#
# "High school graduate or higher" adds up every level from high school
# graduate through graduate/professional degree; selecting only the first and
# last of those columns would leave out some college, associate and bachelor.
#
# NOTE(review): the column names are the ones this script uses when it builds
# education_data -- confirm they match crime_data exactly.
data <- mutate(
  crime_data,
  Percent.of.less.than.high.school =
    Percent.of.less.than.9th.grade +
    Percent.of.9th.to.12th.grade,
  Percent.of.high.school.graduate.or.higher =
    Percent.of.high.school.graduate +
    Percent.of.some.college +
    Percent.of.associate.s.degree +
    Percent.of.bachelor.s.degree +
    Percent.of.graduate.or.professional.degree
)
library(ggplot2)
library(plotly)
# Education levels from lowest to highest; used below as the factor levels so
# the legend follows this order rather than alphabetical order.
education_order <- c(
  "Less_than_9th",
  "Grade_9th_to_12th",
  "High_school_graduate",
  "Some_college",
  "Associate_degree",
  "Bachelor_degree",
  "Graduate_professional_degree"
)

# One short-named column per education level, plus the crime incident count.
education_data <- data.frame(
  Less_than_9th                = crime_data$Percent.of.less.than.9th.grade,
  Grade_9th_to_12th            = crime_data$Percent.of.9th.to.12th.grade,
  High_school_graduate         = crime_data$Percent.of.high.school.graduate,
  Some_college                 = crime_data$Percent.of.some.college,
  Associate_degree             = crime_data$Percent.of.associate.s.degree,
  Bachelor_degree              = crime_data$Percent.of.bachelor.s.degree,
  Graduate_professional_degree = crime_data$Percent.of.graduate.or.professional.degree,
  Crime_incidents              = crime_data$The.number.of.crime.incidents
)

# Keep only the rows with fewer than 2000 crime incidents; which() discards
# rows where the comparison is NA.
filtered_data <- education_data[which(education_data$Crime_incidents < 2000), ]

# Long format: one row per (observation, education level) pair, with the
# education level stored as an ordered-by-level factor.
education_data_long <- filtered_data %>%
  tidyr::pivot_longer(
    cols = -Crime_incidents,
    names_to = "Education_column",
    values_to = "Percent"
  ) %>%
  mutate(Education_column = factor(Education_column, levels = education_order))

# Scatter plot of incident count against education share, coloured by level.
p <- ggplot(
  data = education_data_long,
  mapping = aes(x = Percent, y = Crime_incidents, colour = Education_column)
) +
  geom_point(size = 0.5) +
  theme_light() +
  labs(
    title = "Crime Incidents vs Education Level",
    x = "Percent of Education Level",
    y = "Number of Crime Incidents"
  )

# Wrap the static plot in an interactive plotly widget and display it.
plotly_plot <- ggplotly(p)
plotly_plot
library(dplyr)
library(plotly)
# Tag every row with the ten-year bracket of its median age. The conditions
# are checked top to bottom and the first match wins, so each upper bound only
# needs to be tested once; values below 20, at or above 60, and missing values
# all end up in "Other".
crime_data <- crime_data %>%
  mutate(
    age_bin = case_when(
      Median.age < 20 ~ "Other",
      Median.age < 30 ~ "20-30",
      Median.age < 40 ~ "30-40",
      Median.age < 50 ~ "40-50",
      Median.age < 60 ~ "50-60",
      TRUE ~ "Other"
    )
  )

# Total number of crime incidents within each age bracket.
crime_counts <- summarize(
  group_by(crime_data, age_bin),
  crime_count = sum(The.number.of.crime.incidents)
)

# Interactive bar chart of the per-bracket totals.
layout(
  plot_ly(data = crime_counts, x = ~age_bin, y = ~crime_count, type = "bar"),
  title = "Crime Incidents by Age",
  xaxis = list(title = "Age Group"),
  yaxis = list(title = "Count of Crime Incidents")
)
# Split Median.housing.value into three intervals (breaks = 3 asks cut() for
# three bins over the observed range) and label them Low / Medium / High.
crime_data$Housing_Level <- cut(
  crime_data$Median.housing.value,
  breaks = 3,
  labels = c("Low", "Medium", "High")
)

# Quartiles of the incident count. na.rm = TRUE keeps quantile() from
# aborting if the column contains missing values.
q1 <- quantile(crime_data$The.number.of.crime.incidents, probs = 0.25, na.rm = TRUE)
q3 <- quantile(crime_data$The.number.of.crime.incidents, probs = 0.75, na.rm = TRUE)

# Interquartile range and the 1.5 * IQR fences used to flag outliers.
iqr <- q3 - q1
upper_bound <- q3 + 1.5 * iqr
lower_bound <- q1 - 1.5 * iqr

# Rows whose incident count lies within the fences (subset() also drops rows
# where the comparison is NA).
filtered_data <- subset(
  crime_data,
  The.number.of.crime.incidents >= lower_bound &
    The.number.of.crime.incidents <= upper_bound
)

# Boxplot of incident counts per housing-value level on the outlier-free data.
# Only this plot is built: an identical plot on the unfiltered data was
# previously assigned to `p` and overwritten here without being displayed.
p <- ggplot(
  filtered_data,
  aes(x = Housing_Level, y = The.number.of.crime.incidents, fill = Housing_Level)
) +
  geom_boxplot() +
  labs(
    x = "Housing Value",
    y = "Crime Incidents Level",
    title = "Relationship Between Housing Value and Crime Incidents"
  ) +
  scale_fill_manual(
    values = c("Low" = "lightblue", "Medium" = "lightgreen", "High" = "lightcoral")
  ) +
  theme_minimal()

# Show the plot
print(p)